####################################################################################

#Detecting Misinformation: Identifying False News Spread by Political Leaders in the Global 
#This file creates the post level analysis for Figures 1 and 3 and 6 and Tables 1 and 2 and 3
#and data cited in the paper. It also includes several online appendix analyses.
#****DUE TO THE PRIVACY RESTRICTIONS WE ARE UNABLE TO SHARE POST LEVEL DATA

####################################################################################

rm(list=ls())

options(scipen=999) # suppressing scientific notation

# Packages attached for the whole script ("stopwords" used to be listed twice).
pkgs <- unique(c("tidyverse","stm", "stringr", "topicmodels", "tm","reshape2", "pals", "stopwords", "xtable",
                 "text2vec", "readxl", "qdapRegex", "stringi", "hrbrthemes", "patchwork"))

# require() only returns FALSE (with a warning) when a package is missing, which
# lets the script continue and fail much later with an unrelated-looking error.
# Collect the results and stop immediately, naming every package that is absent.
pkgs_loaded <- vapply(pkgs, require, logical(1), character.only = TRUE)
if (!all(pkgs_loaded)) {
  stop("Could not load required package(s): ",
       paste(pkgs[!pkgs_loaded], collapse = ", "),
       call. = FALSE)
}

#load post level dataset -- DATA UNAVAILABLE DUE TO PRIVACY RESTRICTIONS
post <- read_rds("Data/final-post-dat.rds") 

#========================================================================
#Figure 1: Overlap in posts identified as sharing false content by detection approach
# Rows: text-based flag (fake_post); columns: domain-based flag (base_url_fake).
table(post[["fake_post"]], post[["base_url_fake"]])


#========================================================================
#Figure 3: Topics flagged by Structural Topic Models

# Year and platform enter the topic models as categorical covariates
post[["year"]] <- as.factor(post[["year"]])
post[["platform"]] <- as.factor(post[["platform"]])

# Strip leftover URL fragments from the text column. The three literal
# patterns are removed one after another, in this order.
post_topic <- post %>%
  mutate(
    all_text = gsub("https", "", all_text),
    all_text = gsub("mblnew", "", all_text),
    all_text = gsub("www", "", all_text)
  )

#============
# Two disjoint subsets, one per detection approach
#============
# flagged by the text approach only
text_no_domain <- post_topic %>%
  filter(fake_post == 1, base_url_fake == 0)

# flagged by the domain approach only
domain_no_text <- post_topic %>%
  filter(fake_post == 0, base_url_fake == 1)

#======================
# Preprocessing for the text-based topic model
#======================

# Tokenise, lowercase, stem and strip stopwords/numbers/punctuation from the
# posts flagged by the text approach only. Arguments marked "default" repeat
# the stm::textProcessor defaults explicitly.
processed_text_no_domain <- textProcessor(
  documents = text_no_domain$all_text,
  metadata = text_no_domain,
  language = "pt",            # Portuguese stopwords/stemmer (non-default)
  onlycharacter = TRUE,       # non-default
  lowercase = TRUE,           # default
  removestopwords = TRUE,     # default
  removenumbers = TRUE,       # default
  removepunctuation = TRUE,   # default
  stem = TRUE,                # default
  wordLengths = c(3,Inf),     # default
  sparselevel = 1,            # default
  striphtml = FALSE,          # default
  customstopwords = NULL,     # default
  verbose = TRUE,             # default
  v1 = FALSE
)

# Drop terms that do not appear in more than 10 documents
out_text_no_domain <- prepDocuments(
  documents = processed_text_no_domain$documents,
  vocab = processed_text_no_domain$vocab,
  meta = processed_text_no_domain$meta,
  lower.thresh = 10
)

# Working objects consumed by the model-fitting code below
docs <- out_text_no_domain[["documents"]]
vocab <- out_text_no_domain[["vocab"]]
meta <- out_text_no_domain[["meta"]]


#identify the appropriate number of topics between 3 and 15 topics
set.seed(123)

# Fit held-out diagnostics for K = 3, 5, ..., 15.
# The prevalence covariate must be spelled `platform` (lower case): that is the
# column converted to a factor above and the one every other model in this
# file uses. R is case-sensitive, so `Platform` would not refer to it.
system.time(
  text_topics_test <- searchK(documents = docs,
                              vocab = vocab,
                              K = seq(3,15, by=2),
                              N = 200, 
                              proportion = 0.5, 
                              heldout.seed = 1234, 
                              M = 10, 
                              cores = 1,
                              prevalence =~ platform + year,
                              max.em.its = 75,
                              data = meta,
                              init.type = "Spectral",
                              verbose=TRUE)
)

#=============================================================================
# print appendix figure 24: Diagnostics for Structural Topic Model for Posts Identified with the Text-Based Approach
png("Figures/text-based-k-diagnostics.png", width=10, height=6, units="in", res=300)

plot(text_topics_test)
dev.off()

#==================================================
#code for appendix figure 25: Comparing Average Exclusivity and Average Semantic Coherence at Different 
#Ks for Posts Identified with the Text-Based Approach

# Fit one model per candidate K, using the same covariates as the final model
stm_diffk_text <- tibble(K = c(7, 9,11,13,15)) %>% 
  mutate(model = map(K, ~ stm(docs,
                              vocab,
                              data=meta,
                              prevalence =~ platform + year,
                              K = .,
                              verbose = FALSE)))

# Per-topic exclusivity and semantic coherence for every fitted model
model_scores_text <- stm_diffk_text %>% 
  mutate(exclusivity = map(model, exclusivity),
         semantic_coherence = map(model, semanticCoherence, docs)) %>% 
  select(K, exclusivity, semantic_coherence)

# Plot the per-K means. The ggplot object is wrapped in print(): a ggplot is
# only drawn when printed, and top-level auto-printing does not happen when the
# script is run through source(), which would leave the PNG empty.
png("Figures/text-based-coherence-exclusivity.png", width=10, height=6, units="in", res=300)

print(
  model_scores_text %>% 
    unnest(c(exclusivity, semantic_coherence)) %>% 
    group_by(K) %>% 
    summarize(exclusivity = mean(exclusivity),
              semantic_coherence = mean(semantic_coherence)) %>% 
    ggplot(aes(x = semantic_coherence, y = exclusivity, color = as.factor(K))) +
    geom_point() +
    theme_classic() +
    labs(x = "Semantic coherence",
         y = "Exclusivity",
         #title = "Comparing average exclusivity and average semantic coherence",
         subtitle = " ") +
    scale_color_brewer(name = "Topics (K)", palette = "Dark2")
)
dev.off()

#=================================
#run topic model with 11 topics based on diagnostics from figures 24 and 25
#=================================
# NOTE(review): max.em.its = 5 caps EM at five iterations, far below the 75
# used in searchK above -- confirm this is intentional and not a leftover from
# a quick test run before relying on the fitted topics.
system.time({
  stm_text <- stm(docs, vocab, 11,
                  prevalence =~ platform + year,
                  data = meta,
                  seed = 15, max.em.its = 5
  )
})

labelTopics(stm_text)

#============================================
#get top words for Appendix Table 6: Top Frequent and Exclusive Words by Topic for Text-Based Detection Approach
# One column per topic, ten FREX words each
words_stm_text <- as.data.frame(t(labelTopics(stm_text, n = 10)$frex))

# `type` and `file` are arguments of print.xtable(), not xtable(); passed to
# xtable() they are silently ignored and no .tex file is produced.
print(xtable(words_stm_text), type = "latex", file = "top-words-stm-text.tex")

#==============================================
#plot topics: Figure 3 - Topics flagged by Structural Topic Models (Text-based approach)
png("Figures/text-based-stm.png", width=11, height=6, units="in", res=300)

plot(stm_text, type = "summary", labeltype = "frex", n= 5,  main = "Top Topics for Text-Based False Content", 
     xlab = "Expected Topic Proportions")

dev.off()


#======================
# Preprocessing for the domain-based topic model
#======================

# Same text pipeline as for the text-based subset, applied to posts flagged by
# the domain approach only. Arguments marked "default" repeat the
# stm::textProcessor defaults explicitly.
processed_domain_no_text <- textProcessor(
  documents = domain_no_text$all_text,
  metadata = domain_no_text,
  language = "pt",            # Portuguese stopwords/stemmer (non-default)
  onlycharacter = TRUE,       # non-default
  lowercase = TRUE,           # default
  removestopwords = TRUE,     # default
  removenumbers = TRUE,       # default
  removepunctuation = TRUE,   # default
  stem = TRUE,                # default
  wordLengths = c(3,Inf),     # default
  sparselevel = 1,            # default
  striphtml = FALSE,          # default
  customstopwords = NULL,     # default
  verbose = TRUE,             # default
  v1 = FALSE
)

# Drop terms that do not appear in more than 10 documents
out_domain_no_text <- prepDocuments(
  documents = processed_domain_no_text$documents,
  vocab = processed_domain_no_text$vocab,
  meta = processed_domain_no_text$meta,
  lower.thresh = 10
)

# Overwrite the working objects so the code below fits the domain-based models
docs <- out_domain_no_text[["documents"]]
vocab <- out_domain_no_text[["vocab"]]
meta <- out_domain_no_text[["meta"]]

# Search over K = 3, 5, ..., 15 for the domain-based subset
set.seed(123)

system.time({
  domain_topics_test <- searchK(
    documents = docs,
    vocab = vocab,
    K = seq(3,15, by=2),
    init.type = "Spectral",
    N = 200,
    proportion = 0.5,
    heldout.seed = 1234,
    M = 10,
    cores = 1,
    # remaining arguments are forwarded to stm()
    prevalence =~ platform + year,
    data = meta,
    max.em.its = 75,
    verbose = TRUE
  )
})
#=============================================================================
# Appendix figure 26: Diagnostics for Structural Topic Model for Posts Identified with the Domain-Based Approach
png("Figures/domain-based-k-diagnostics.png", width=10, height=6, units="in", res=300)

plot(domain_topics_test)
dev.off()

#==================================================
#code for appendix figure 27: Comparing Average Exclusivity and Average Semantic Coherence at Different 
#Ks for Posts Identified with the Domain-Based Approach

# Fit one model per candidate K, using the same covariates as the final model
stm_diffk_domain <- tibble(K = c(7, 9,11,13,15)) %>% 
  mutate(model = map(K, ~ stm(docs,
                              vocab,
                              data=meta,
                              prevalence =~ platform + year,
                              K = .,
                              verbose = FALSE)))



# Per-topic exclusivity and semantic coherence for every fitted model
model_scores_domain <- stm_diffk_domain %>% 
  mutate(exclusivity = map(model, exclusivity),
         semantic_coherence = map(model, semanticCoherence, docs)) %>% 
  select(K, exclusivity, semantic_coherence)

# Plot the per-K means. The ggplot object is wrapped in print(): a ggplot is
# only drawn when printed, and top-level auto-printing does not happen when the
# script is run through source(), which would leave the PNG empty.
png("Figures/domain-based-coherence-exclusivity.png", width=10, height=6, units="in", res=300)
print(
  model_scores_domain %>% 
    unnest(c(exclusivity, semantic_coherence)) %>% 
    group_by(K) %>% 
    summarize(exclusivity = mean(exclusivity),
              semantic_coherence = mean(semantic_coherence)) %>% 
    ggplot(aes(x = semantic_coherence, y = exclusivity, color = as.factor(K))) +
    geom_point()  +
    theme_classic() +
    labs(x = "Semantic coherence",
         y = "Exclusivity",
         #title = "Comparing average exclusivity and average semantic coherence",
         subtitle = " ") +
    scale_color_brewer(name = "Topics (K)", palette = "Dark2")
)

dev.off()

#=================================
#run topic model with 9 topics based on diagnostics from figures 26 and 27
#=================================

# NOTE(review): max.em.its = 5 caps EM at five iterations, far below the 75
# used in searchK above -- confirm this is intentional and not a leftover from
# a quick test run before relying on the fitted topics.
system.time({
  stm_domain <- stm(docs, vocab, 9,
                    prevalence =~ platform + year,
                    data = meta,
                    seed = 15, max.em.its = 5
  )
})

labelTopics(stm_domain)

#============================================
#get top words for Appendix Table 7: Top Frequent and Exclusive Words by Topic for Domain-Based Detection Approach
#FREX: are the words that are both frequent and exclusive, 
#identifying words that distinguish topics. 

# One column per topic, ten FREX words each
words_stm_domain <- as.data.frame(t(labelTopics(stm_domain, n = 10)$frex))

# `type` and `file` are arguments of print.xtable(), not xtable(); passed to
# xtable() they are silently ignored and no .tex file is produced.
print(xtable(words_stm_domain), type = "latex", file = "top-words-stm-domain.tex")

#==============================================
#plot topics: Figure 3 - Topics flagged by Structural Topic Models (Domain-based approach)

png("Figures/domain-based-stm.png", width=11, height=6, units="in", res=300)

plot(stm_domain, type = "summary", n= 5,  labeltype = "frex", main = "Top Topics for Domain-Based False Content", 
     xlab = "Expected Topic Proportions")
dev.off()


#============
#Table 1: Process for Identifying False Content Using the Text-Based Approach

#column 2: Total Number of Posts Left
#full dataset
full_df <- nrow(post)
#pred prob
pred_prob <- sum(post$pred_prob)
#cosine
cosine <- sum(post$cosine_above)
#text-based
text_approach <- sum(post$fake_post)

#column 3: Proportion of Original Dataset
full_df_prop <- full_df/nrow(post)
pred_prob_prop <- pred_prob/nrow(post)
cosine_prop <- cosine/nrow(post)
text_approach_prop <- text_approach/nrow(post)

# Build the table column by column so counts and proportions stay numeric.
# Filling a matrix with c(<strings>, <numbers>) coerces every cell to character,
# which makes xtable's `digits` a no-op and prints proportions at full precision.
# Column names V1-V3 match what as.data.frame(matrix(...)) produced.
table1_dat <- data.frame(
  V1 = c("Raw data: All politicians' posts",
         "Naive Bayes classification model: Posts linguistically unlike real news",
         "Cosine similarity $>$.4: Posts that are linguistically similar to false claims",
         "Manually reviewed: Posts that are false"),
  V2 = c(full_df, pred_prob, cosine, text_approach),
  V3 = c(full_df_prop, pred_prob_prop, cosine_prop, text_approach_prop),
  stringsAsFactors = FALSE
)



#final table 1
# digits has one entry per column plus one for the row names:
# whole numbers for the counts, three decimals for the proportions.
table1 <- xtable(table1_dat, digits = c(0, 0, 0, 3))
# The row labels already contain LaTeX ("$>$"); switch off the default
# sanitizer so it is written verbatim instead of being escaped.
print(table1, file = "Tables/table1.tex", sanitize.text.function = identity)

#============
#Figure 6: Distribution of Posts Classified as False with a Predicted Probability 
#between .5 and .9 and their Highest Cosine Matches with False Claims

# Read the per-platform predicted probabilities and keep only posts whose
# probability of being false lies strictly between .5 and .9.
fb_dat <-read_csv("Data/pred-prob-facebook_all.csv")%>% #raw predicted probabilities by post from classification model
  filter(prob_fake > .5 & prob_fake < .9)

ig_dat <-read_csv("Data/pred-prob-instagram_all.csv")%>%
  filter(prob_fake > .5 & prob_fake < .9)

twitter_dat <-read_csv("Data/pred-prob-twitter_all.csv")%>%
  filter(prob_fake > .5 & prob_fake < .9)

# The three sample_n() calls below consume the random stream in this order;
# reordering them changes which posts are drawn.
set.seed(123) #set seed for replication

#random sample posts by platform and year (~300 total posts)
# 34 posts per year and platform; sample_n() errors if a year has fewer than 34 rows.
# Only year, text, probability and URL are kept -- NOTE(review): post_id is not
# carried forward here although the Figure 6 code later selects it; confirm
# where that column is expected to come from.
fb_dat_sample <- fb_dat %>%
  group_by(year) %>%
  sample_n(34)%>%
  ungroup() %>%
  select(year, all_text, prob_fake, "url" = URL) %>%
  mutate(platform = "Facebook")

ig_dat_sample <- ig_dat %>%
  group_by(year) %>%
  sample_n(34)%>%
  ungroup() %>%
  select(year, all_text, prob_fake, "url" = URL) %>%
  mutate(platform = "Instagram")

# Twitter stores the post text in `text` and the link in lower-case `url`;
# both are renamed/selected to match the other two platforms.
twitter_dat_sample <- twitter_dat %>%
  group_by(year) %>%
  sample_n(34) %>%
  ungroup() %>%
  select(year, "all_text" = text, prob_fake, url) %>%
  mutate(platform = "Twitter")

#combine
# Stack the three samples; columns are identical by construction above.
all_platform_sample <- rbind.data.frame(fb_dat_sample, ig_dat_sample, twitter_dat_sample)

#==================
#calculate cosine similarity for random selection of posts between .5 and .9 pred probability of being false

# Single alternation regex matching any Portuguese stopword as a whole word
stopwords_regex <- paste0('\\b', paste(rev(stopwords('pt')), collapse = '\\b|\\b'), '\\b')

# Normalise free text for TF-IDF comparison.
#
# data_column: character vector of raw post or fact-check text.
# Returns a character vector of the same length: URLs removed, lower-cased,
# letters only, Portuguese stopwords and single-letter words dropped, accents
# transliterated to ASCII, whitespace collapsed and trimmed.
# Reads the file-level `stopwords_regex` defined just above.
# The steps run in the order listed; stopwords are removed before accents are
# stripped so that accented stopwords still match.
clean_text <- function(data_column) {
  # remove urls (twitter short links and generic urls)
  data_column <- rm_url(data_column, pattern=pastex("@rm_twitter_url", "@rm_url"))
  # make text lower case
  data_column <- str_to_lower(data_column)
  # replace every non-alphabetic character with a space
  data_column <- str_replace_all(data_column, "[^[:alpha:]]", " ")
  # remove portuguese stopwords like de, do and da
  data_column <- str_replace_all(data_column, stopwords_regex, " ")
  # remove accents
  data_column <- stri_trans_general(str = data_column, id = "Latin-ASCII")
  # remove single letter words
  data_column <- str_replace_all(data_column, "\\b\\w{1,1}\\s", "")
  # collapse multiple spaces
  data_column <- str_replace_all(data_column, "\\s+", " ")
  # trim both ends and return the result visibly (the function previously
  # ended on an assignment, so its value was returned invisibly)
  str_trim(data_column, side = "both")
}

#df with original text of fact check claims 
boatos <- read_excel("boatos-fact-check-text.xlsx", sheet= 1) %>%
  #separate posts with multiple story versions (one row per version)
  mutate(text = strsplit(as.character(text), "Versão")) %>% 
  unnest(text) %>%
  #clean all text
  mutate(description_clean = clean_text(text)) %>%
  #count the words (all tokens, not unique ones) in the cleaned text;
  #lengths() always returns a plain integer vector, unlike sapply()
  mutate(total_message = lengths(strsplit(as.character(description_clean), "\\W+"))) %>%
  #message must be more than 10 words
  filter(total_message >10) %>% 
  #sequential id; row_number() is also safe when no rows remain, where 1:n() is not
  mutate(row_id = row_number()) 

#clean sample post text and drop posts that are empty after cleaning
all_platform_sample <- all_platform_sample %>%
  mutate(description_clean = clean_text(all_text)) %>%
  filter(description_clean != "")

#create storage for calculating cosine similarity between sample posts and false claims text
# (preallocated, one slot per sampled post)
n_sample_posts <- nrow(all_platform_sample)
max_cosine2 <- vector("list", n_sample_posts)
row_id2 <- vector("list", n_sample_posts)

#loop through posts, comparing each one with every fact-checked claim
# seq_len() keeps the loop from running when the sample is empty.
for (i in seq_len(n_sample_posts)) {
  #corpus for this iteration: the single post (document 1) followed by all fact-check texts
  full_dat <- c(all_platform_sample$description_clean[i], boatos$description_clean)
  
  full_dat <- data.frame(description_clean = full_dat)
  
  it <- itoken(full_dat$description_clean, progressbar = FALSE)
  #get vocabulary used across full set
  v <- create_vocabulary(it)
  #drop terms used only once and terms appearing in more than 10% of documents, then vectorize
  v <- prune_vocabulary(v, doc_proportion_max = 0.1, term_count_min = 2)
  vectorizer <- vocab_vectorizer(v)
  
  #TF-IDF weighted document term matrix for the post plus the fact check docs
  dtm1 <- create_dtm(it, vectorizer)
  tfidf <- TfIdf$new()
  dtm1_tfidf <- fit_transform(dtm1, tfidf)
  
  #cosine similarity between the post (row 1) and every document.
  #Only this one row is ever used, so compute it alone instead of the full
  #pairwise matrix; the column names are still the document ids "1", "2", ...
  d1_cos_sim_tfidf <- sim2(dtm1_tfidf[1, , drop = FALSE], dtm1_tfidf,
                           method = "cosine", norm = "l2")
  post_sims <- as.numeric(d1_cos_sim_tfidf[1, ])
  
  #best match among the fact-checked texts: column 1 is the post itself and is
  #excluded explicitly, so ties or an exact duplicate (similarity 1) can never
  #select the self-match; which.max() returns the first column on ties
  best_match <- which.max(post_sims[-1]) + 1L
  if (length(best_match) == 0L) {
    #no comparable document (e.g. every similarity is NaN)
    max_cosine2[[i]] <- NA_real_
    row_id2[[i]] <- NA_character_
  } else {
    max_cosine2[[i]] <- post_sims[best_match]
    row_id2[[i]] <- colnames(d1_cos_sim_tfidf)[best_match]
  }
  print(paste0("Starting next post: ", i+1))
  
}

#add new column with max cosine to df
all_platform_sample$max_cosine_tfidf <- as.numeric(max_cosine2)

all_platform_sample$row_id_tfidf <- as.numeric(row_id2) 

#document ids in the similarity matrix are offset by one because the post
#occupies position 1; shift the fact-check ids so they line up.
#NOTE: this modifies boatos in place -- run it only once per session.
boatos$row_id <-boatos$row_id + 1

#create final dataframe
final_df <- left_join(all_platform_sample, boatos, by = c("row_id_tfidf" = "row_id")) %>%
  distinct() # merge together to review SM text and fact check text side by side


#generate figure 6
# The sampled-post tables built for this figure keep only year, text,
# probability, url and platform, so final_df may not contain a post_id column.
# It is only needed as the id column for reshaping, so fall back to a row
# counter when it is absent; an existing post_id is used unchanged.
fig6_input <- final_df
if (!"post_id" %in% names(fig6_input)) {
  fig6_input <- fig6_input %>% mutate(post_id = row_number())
}

# Long format: one row per post and measure
final_df_figures <- fig6_input %>%
  select(post_id, prob_fake, max_cosine_tfidf) %>%
  pivot_longer(-post_id, names_to = "type", values_to= "value") %>%
  mutate(type = if_else(type == "prob_fake", "Probability of Being False (Classification Model)", "Similarity to False Text(Cosine Similarity)"))


# Left panel: predicted probabilities, with dashed lines marking the sampled range
fig6a <- final_df_figures %>%
  filter(type == "Probability of Being False (Classification Model)") %>%
  ggplot(aes(value)) + 
  geom_histogram(bins = 20)+
  theme_ipsum() + 
  theme(legend.position = "top") +
  geom_vline(xintercept = .499, linetype = "dashed", color = "red") +
  xlim(0, 1) +
  geom_vline(xintercept = .92, linetype = "dashed", color = "red") +
  labs(subtitle = "Probability of Being False (Classification Model) between .5 and .9",
       x = "Probability", 
       y = "Count")

# Right panel: highest cosine similarity to a fact-checked claim, with the .4 cut-off
fig6b <- final_df_figures %>%
  filter(type == "Similarity to False Text(Cosine Similarity)") %>%
  ggplot(aes(value)) + 
  geom_histogram(bins = 20)+
  theme_ipsum() + 
  theme(legend.position = "top") +
  geom_vline(xintercept = .4, linetype = "dashed", color = "red") +
  xlim(0, 1) +
  labs(subtitle = "Similarity to False Text (Cosine Similarity) <.4",
       x = "Cosine Similarity", 
       y = "Count")

# patchwork: place the two panels side by side
fig_59 <- fig6a+fig6b

ggsave("Figures/rr-pred-prob-59.png", fig_59, width = 10)

#==============================================
#Table 3: Percent false positives and false negatives from sampled data for posts that were verifiable by external fact-checkers

#==================================
# sample posts three different ways
#==================================

# The three sample_n() calls consume the random stream in this order;
# keep the order fixed so the drawn samples are reproducible.
set.seed(123)

#sample 1: 500 of overall sample (naive approach) -- stratified by year (NAIVE)
# 167 posts per year
all_fake_sample1 <- post %>%
  group_by(year) %>%
  sample_n(167)

write_csv(all_fake_sample1, "Data/sample1-naive.csv")

#sample 2: flagged domain and not flagged domain -- stratified by year (DOMAIN)
# 84 posts per year and domain flag
all_fake_sample2 <- post %>%
  group_by(year, base_url_fake) %>%
  sample_n(84)

write_csv(all_fake_sample2, "Data/sample2-domain.csv")


#sample 3: flagged text and not flagged text -- stratified by year (TEXT)
# 84 posts per year and text flag
all_fake_sample3 <- post %>%
  group_by(year, fake_post) %>%
  sample_n(84)

# Written next to the other two samples: the file is read back from
# "Data/sample3-text.csv" further down, so writing it under "Data/All SM/"
# left that read pointing at a file this script never produced.
write_csv(all_fake_sample3, "Data/sample3-text.csv")

#===========================================================
#samples of posts were then sent to Boatos for fact checking after hiding columns denoting which approach they were flagged by

#load original samples
sample1 <- read_csv("Data/sample1-naive.csv")
sample2 <- read_csv("Data/sample2-domain.csv")
sample3 <- read_csv("Data/sample3-text.csv")

# NOTE(review): the tables below read `fake_domain` and `fake_text` from the
# samples. The sampling code in this file writes the flags under the names
# `base_url_fake` and `fake_post`; confirm the CSVs on disk actually carry the
# column names used here.

#SAMPLE 1: NAIVE (N = 501)

#load boatos coding of sample 1 with relevant review columns
boatos_sample1 <- read_csv("Data/boatos-coding-sample1-naive.csv") %>%
  select(URL, `Verifiable?`, update_verdict, `Erro`)

#merge with original sample to get text-based and domain-based classifications by URL
# No `by` is given, so the join uses every column name the two tables share
# (presumably just URL -- verify against the join message).
sample1_with_boatos <- left_join(sample1, boatos_sample1)

#boatos coding compared to domain approach
# Cell percentages of the whole sample (rows: detection flag, columns: Boatos verdict)
table(sample1_with_boatos$fake_domain, sample1_with_boatos$update_verdict)/nrow(sample1_with_boatos)*100 

#---no text approach to compare to

#=======================
# SAMPLE 2: DOMAIN (N = 504)

#load boatos coding of sample 2 with relevant review columns
# This file is read as latin1, unlike the other two coding files.
boatos_sample2 <- read_csv("Data/boatos-coding-sample2-domain.csv", locale=locale(encoding="latin1"))%>%
  select(URL, `Verifiable?`, update_verdict, `Erro`)

#merge with original sample to get text-based and domain-based classifications by URL
sample2_with_boatos <- left_join(sample2, boatos_sample2)

#boatos coding compared to domain approach
table(sample2_with_boatos$fake_domain, sample2_with_boatos$update_verdict)/nrow(sample2_with_boatos)*100 

#---no text approach to compare to

#=======================
# SAMPLE 3: TEXT (N = 504)

#load boatos coding of sample 3 with relevant review columns
boatos_sample3 <- read_csv("Data/boatos-coding-sample3-text.csv")%>%
  select(URL, `Verifiable?`, update_verdict, `Erro`)

#merge with original sample to get text-based and domain-based classifications by URL
sample3_with_boatos <- left_join(sample3, boatos_sample3)


#boatos coding compared to domain approach
table(sample3_with_boatos$fake_domain, sample3_with_boatos$update_verdict)/nrow(sample3_with_boatos)*100 

#boatos coding compared to text approach
table(sample3_with_boatos$fake_text, sample3_with_boatos$update_verdict)/nrow(sample3_with_boatos)*100

#*****due to the complexity of Table 3, the table was assembled manually from the output of the four table() calls above
#(samples 1 and 2 vs. the domain approach; sample 3 vs. the domain and the text approach).
#All the numbers cited in the discussion of Table 3 come from these same outputs.

#======================================================================
#======================================================================
      ################## Data cited in paper ################## 
#======================================================================
#======================================================================

#Through this process, we include 128,073 Facebook posts, 61,986 Instagram posts, and 233,132 Twitter posts. 
#This left us with 423,191 ``suspicious'' posts across all social media sites, or approximately 10.5\% of our initial dataset. 

# "Suspicious" posts (pred_prob flag) per platform
post %>%
  group_by(platform) %>%
  dplyr::summarize(total = sum(pred_prob))

# ... and as a share of all posts
sum(post[["pred_prob"]]) / nrow(post)

#For example, out of the 39,097 posts identified using the text- or the domain-based approach, 
#only 19 posts were identified by both approaches

# fake_domain is 1 when either the text flag or the domain flag is set, 0 otherwise
# (missing flags count as not set)
post <- post %>%
  mutate(
    fake_domain = case_when(
      fake_post == 1 | base_url_fake == 1 ~ 1,
      TRUE ~ 0
    )
  )
table(post[["fake_domain"]])

# overlap between the two approaches
table(post[["fake_post"]], post[["base_url_fake"]])

#(1.4 million from Facebook, 1.1 million from Instagram and 1.5 million from Twitter)
table(post[["platform"]])

#Politicians are very active users of social media, posting, on average, 6 times a day or 131 times a month.
# posts per politician and day
tmp <- post %>%
  group_by(politician_name, month, date, year) %>%
  summarise(count = n())
summary(tmp$count)
# posts per politician and month
tmp <- post %>%
  group_by(politician_name, month, year) %>%
  summarise(count = n())
summary(tmp$count)

# only 0.01 out of 100 posts contain false information. /// Using our text-based approach, about 0.01% (421) of politicians’ 
#posts contain false stories. 

table(post[["fake_post"]]) / nrow(post) * 100

#0           1 
#99.98956088  0.01043912 

#The domain-based approach suggests a much larger share of posts containing misinformation (0.96%) 
table(post[["base_url_fake"]]) / nrow(post) * 100

#0          1 
#99.0405184  0.9594816 

#Facebook URLs approach indicates a much smaller share (0.001%). 
table(post[["full_fake_url"]]) / nrow(post) * 100

#0          1 
#99.9987602  0.0012398 


#Regardless of approach, however, less than 1% of all the content shared by politicians misinformation, confirming that 
#this practice is a rare event.
# share of posts flagged by at least one of the three approaches
post %>%
  mutate(
    any_misinfo = if_else(fake_post == 1 | base_url_fake == 1 | full_fake_url == 1, 1, 0)
  ) %>%
  summarize(total = sum(any_misinfo) / n() * 100)

#0.969

#posts containing misinformation using our text-based approach receive 10 times more engagement than posts without 
#misinformation (in terms of median number of reactions)

post %>%
  group_by(fake_post) %>%
  summarise(median_reaction = median(total_reactions, na.rm = TRUE))

# ratio of the two medians reported above
1182/120

#Overall, using at least one of the methods employed (text, domain, Facebook URL), we detect false content
#in 0.9694% of posts, and these 0.9694% of posts represent about 1.4197% of all the online
#engagement to politicians’ posts in the period analyzed
# label every post by whether any misinformation flag is set
post <- post %>%
  mutate(
    misinfo_class = case_when(
      fake_post == 1 | base_url_fake == 1 | full_fake_url == 1 ~ "misinfo",
      TRUE ~ "not misinfo"
    )
  )

prop.table(table(post[["misinfo_class"]])) * 100

# share of all reactions that went to flagged posts
sumr <- post %>%
  filter(misinfo_class == "misinfo") %>%
  summarise(sumr = sum(total_reactions, na.rm = TRUE))
total <- sum(post[["total_reactions"]], na.rm = TRUE)
(sumr$sumr/total)*100


#we do not find that posts with misinformation are seen disproportionately more relative to their size -- in fact, they are 
#seen at lower frequencies than posts without misinformation (only 0.06\% of views are to posts with any misinformation)
# share of all views that went to flagged posts
sumr <- post %>%
  filter(misinfo_class == "misinfo") %>%
  summarise(sumr = sum(total_views, na.rm = TRUE))
total <- sum(post[["total_views"]], na.rm = TRUE)
(sumr$sumr/total)*100


#In over four million posts in our #data, more than 75% did not include a hyperlink to an external domain
# Flag posts whose text contains "http" (1 = has a link, 0 = no link, NA if text is missing)
post <- post %>%
  mutate(has_url = as.numeric(str_detect(all_text, "http")))

table(post[["has_url"]]) / nrow(post)
#25% have domains while 75% do not

#Note that 83% of all posts tagged by the text-based approach do not contain a reference to an
#external URL.

post %>%
  filter(fake_post == 1) %>%
  summarize(without_url = sum(has_url == 0) / n())

#Of those that do reference an external URL, 4.5% are also captured in the domain-based approach. 
# computed over all text-flagged posts (no additional filter on has_url)
post %>%
  filter(fake_post == 1) %>%
  summarize(sum(base_url_fake) / n())

#From a random sample of 200 Boatos posts, we found 72 false stories that also appear in at least one of the five other 
#main fact checking websites in Brazil. The other 119 were only found in Boatos, and the remaining 9 were inconclusive 
#because the selected story was in fact a collection of false stories or the stories were too ambiguous so we could not 
#determine if they were reviewed by other fact-checking agencies.

boatos_sample <- read_excel("Data/Boatos Sample.xlsx", sheet = 1)

# table() drops missing values, so the inconclusive stories are not counted here
table(boatos_sample$`any_site?`) 

#0   1 
#119  72 

# inspect the coder comments for the stories without an `any_site?` value
tmp <- boatos_sample %>% filter(is.na(`any_site?`))
tmp$comentarios

#Based on a random sample of 200 stories from the other five fact-checking agencies, we find that about half of those 
#(104) were not in Boatos -- stories are described differently in each fact-checking agency, so two research assistants 
#working independently reviewed each story because there may be disagreement about whether or not the agencies were, in fact, 
#reviewing the same content. Importantly, none of the stories that were examined by Boatos and the other agencies were 
#adjudicated differently by these agencies. Out of the 104 stories not in Boatos, 63% (66) were classified as false. 
#For the remaining 38 stories, they have a combination of true stories (16), sets of campaign promises and platform statements, 
#stories that were about the fact-checking agencies themselves (not checked content), and stories that were not included in Boatos

# One coding sheet per research assistant; variant spellings of "Não" are harmonised
# ("Nao" in the first sheet, "Na" in the second).
sample_fc_ra1 <- read_excel("Data/Cópia de Comparison-Fact-Checking-Agencies-v1.xlsx", sheet = 1) %>% 
  mutate(ra = 1, presenca_boatos = ifelse(presenca_boatos == "Nao", "Não", presenca_boatos))
sample_fc_ra2 <- read_excel("Data/Comparison-Fact-Checking-Agencies-v2.xlsx", sheet = 1) %>% 
  mutate(ra = 2, presenca_boatos = ifelse(presenca_boatos == "Na", "Não", presenca_boatos))

# Side-by-side bind: rows are matched by position, so this assumes both sheets
# list the same stories in the same order -- TODO confirm. Duplicate column
# names get a "...<position>" suffix, which the code below relies on.
combined <- bind_cols(sample_fc_ra1, sample_fc_ra2) #

table(sample_fc_ra1$presenca_boatos)

#Não Sim-diferente adjudicacao     Sim-mesma adjudicacao 
#106                         1                        93 

table(sample_fc_ra2$presenca_boatos)

#Não Sim-mesma adjudicacao 
#117                    82 


# agreement between the two coders (rows: RA 1, columns: RA 2);
# the ...12 / ...27 suffixes are column positions in `combined`
table(combined$`presenca_boatos...12`, combined$`presenca_boatos...27`)

#                        Não     Sim-mesma adjudicacao
#Não                       104                     1
#Sim-diferente adjudicacao   0                     1
#Sim-mesma adjudicacao      13                    80

# stories both coders agree are not in Boatos, broken down by classification
tmp <- combined %>% filter(`presenca_boatos...12` == "Não" & `presenca_boatos...27` == "Não")
table(tmp$classification...1)
# manual tallies taken from the table printed above
1+1+2+1+17+13+2+1 
prop.table(table(tmp$classification...1))*100
tmp2 <- tmp %>% filter(classification...1 != "FALSO")
13 + 2 +1


#We utilize both the exact URL from the dataset (N = 365) and the root domain (N = 228, excluding 
#the common domains such as youtube.com, twitter.com, yahoo.com, and a main newspaper)

# Fact-checked URLs from the fb url database, keeping only rows not marked
# for removal (youtube, twitter, yahoo, folha are marked)
full_url <- read_excel("Data/url-fact-check.xlsx") %>%
  filter(is.na(suggest_removing))

nrow(full_url) #365

# One row per distinct root domain
root_url <- full_url %>%
  distinct(base_url)
nrow(root_url) #228

#To assess where classification errors occurred at this stage, we reviewed 150 randomly sampled headlines.

# Reviewed headlines: half classified as false, half classified as true
headlines_sample <- read_excel("Data/headlines_sample.xlsx")

nrow(headlines_sample) #145

# Name the two RA-coded columns (positions 8 and 10)
names(headlines_sample)[c(8, 10)] <- c("sensationalist", "news_article_esque")


# Flag headlines whose RA coding runs against the model's prediction.
# `&` binds tighter than `|`; the parentheses below spell out how the
# conditions are actually evaluated. Each flag is then blanked (NA) for the
# other prediction class, and the combined indicator treats NA as 0.
classification_errors <- headlines_sample %>%
  select(prediction, news_article_esque, sensationalist) %>%
  mutate(
    true_not_newsy = ifelse(
      prediction == 0,
      NA,
      ifelse((prediction == 1 & news_article_esque == "nao") | sensationalist == "sim", 1, 0)
    ),
    false_newsy = ifelse(
      prediction == 1,
      NA,
      ifelse((prediction == 0 & news_article_esque == "sim") | sensationalist == "nao", 1, 0)
    ),
    opposite_of_classification = if_else(true_not_newsy == 1 | false_newsy == 1, 1, 0),
    opposite_of_classification = if_else(is.na(opposite_of_classification), 0, opposite_of_classification)
  )

#We found that 78% of the articles selected from reputable news sources were misclassified as false when they deployed 
#sensationalist rhetoric, used clickbait, and otherwise did not resemble a news article headline.

classification_errors %>%
  filter(prediction == 1) %>%
  summarize(sum(true_not_newsy) / n()) #.78

#Where the classification model identified fake posts from Boatos as “real,” we found that 80% of the false content read more like 
#a news articles or did not use sensationalist language. 
table(classification_errors[["opposite_of_classification"]]) #24 out of 145 (79%) of false article were either not sensationalist or newslike

#In total, coders disagreed 241 times, or approximately 5.8% of all manually reviewed observations.
#Read every RA assessment csv in the folder into a list of data frames.
#list.files() interprets `pattern` as a regular expression, not a shell glob, so the
#".csv" extension is matched explicitly instead of with "*.csv".
filenames <- list.files("Data/ra_tfidf_assessments", pattern = "\\.csv$", full.names = TRUE)

#The comparisons that follow pick files by position ([[1]] through [[6]]), relying on the
#alphabetical order returned by list.files(); fail early if files are missing.
stopifnot(length(filenames) >= 6L)

assessments_ra <- lapply(filenames, read_csv)

#Facebook: join assessment file 1 to assessment file 4 on every identifying column
#they share, then flag the rows whose `match?` codes differ between the two files
facebook_keys <- c("User.Name", "url_facebook", "month_year", "description_facebook",
                   "max_cosine_tfidf", "url_boatos", "description_boatos")

facebook_joined <- assessments_ra[[1]] %>%
  left_join(assessments_ra[[4]], by = facebook_keys) %>%
  mutate(diff_in_match = `match?.x` != `match?.y`) #TRUE = the two codings disagree

table(facebook_joined[["diff_in_match"]])

#FALSE  TRUE 
#1742   147 
#Instagram: join assessment file 2 to assessment file 5 on every identifying column
#they share, then flag the rows whose `match?` codes differ between the two files
instagram_keys <- c("User.Name", "url_instagram", "month_year", "description_clean_instagram",
                    "max_cosine_tfidf", "url_boatos", "description_clean_boatos")

instagram_joined <- assessments_ra[[2]] %>%
  left_join(assessments_ra[[5]], by = instagram_keys) %>%
  mutate(diff_in_match = `match?.x` != `match?.y`) #TRUE = the two codings disagree

table(instagram_joined[["diff_in_match"]])

#FALSE  TRUE 
#738    34 
#Twitter: join assessment file 3 to assessment file 6 on every identifying column
#they share, then flag the rows whose `match?` codes differ between the two files
twitter_keys <- c("post_id", "handle", "url_twitter", "description_clean_twitter",
                  "max_cosine_tfidf", "url_boatos", "description_clean.y")

twitter_joined <- assessments_ra[[3]] %>%
  left_join(assessments_ra[[6]], by = twitter_keys) %>%
  mutate(diff_in_match = `match?.x` != `match?.y`) #TRUE = the two codings disagree

table(twitter_joined[["diff_in_match"]])

#FALSE  TRUE 
#1374    60

#total number of manually reviewed observations across the three platforms
n_reviewed <- sum(nrow(facebook_joined), nrow(instagram_joined), nrow(twitter_joined))
n_reviewed #4101

#Number of rows where the two codings differ, computed from the diff_in_match flags
#instead of being retyped from the printed tables, so it cannot drift from the data.
#na.rm = TRUE mirrors table(), which leaves NA out of its counts.
n_disagree <- sum(facebook_joined$diff_in_match,
                  instagram_joined$diff_in_match,
                  twitter_joined$diff_in_match,
                  na.rm = TRUE)
n_disagree #241 differences in coding (147 + 34 + 60)

#share of reviewed observations on which the codings disagree
n_disagree / n_reviewed
#[1] 0.05876615 disagreement

#We also took a random sample of 335 posts in which two coders agreed that they were not a match to a false story, and we had a 
#third coder review them. We found 6 (1.8%) instances in which the third coder classified the post as a match to a false story.

#workbook used for the review of negative (non-match) cases; list its sheets first
neg_review_path <- "Data/ra_negative_match_review/Classification-Matches-Positive-Negative.xlsx"
excel_sheets(neg_review_path)

#one sheet of negative cases per platform
neg_facebook <- read_excel(neg_review_path, sheet = "NegativeFacebook")
neg_twitter <- read_excel(neg_review_path, sheet = "NegativeTwitter")
neg_ig <- read_excel(neg_review_path, sheet = "NegativeInstagram")

#stack the three platforms into a single review table
neg_review <- bind_rows(neg_facebook, neg_twitter, neg_ig)

nrow(neg_review) #335

#distribution of the final_match coding
table(neg_review[["final_match"]])
#match no match 
#6      329 

6 / 335 #.018

#To investigate these approaches’ ability to detect false stories, we select a random sample of 200 posts identified by the 
#domain approach as containing misinformation. Two research assistants examined the stories linked in the posts and the content 
#of the posts themselves against our full dataset of all fact-checking agencies (about 15,000 fact-checked stories). Out of the 
#139 posts with working links, 131 (95%) stories were not checked by any fact-checking agency. Of the 8 stories that were checked 
#by a fact-checking agency, 7 were deemed false  or misleading and 1 story was checked and deemed true. 

#the two research assistants' codings of the sampled domain-approach posts
domain_ra1 <- read_excel("Data/sample-urls-dataset-RA1.xlsx")
domain_ra2 <- read_excel("Data/sample-urls-dataset-RA2.xlsx")

#put both codings side by side, matched on URL (.x columns come from RA1, .y from RA2)
domain <- left_join(domain_ra1, domain_ra2, by = "URL") %>%
  mutate(
    #1 when either coding marks the link as off; %in% counts a missing value as "not marked"
    #NOTE(review): the .y column is compared with "Sim" and the .x column with "sim" --
    #confirm each sheet really uses that capitalisation
    off_both = as.numeric(`link off post.y` %in% "Sim" | `link off post.x` %in% "sim"),
    #1 when presenca_agencia is filled in, 0 when it is missing
    checked.x = as.numeric(!is.na(presenca_agencia.x)),
    checked.y = as.numeric(!is.na(presenca_agencia.y))
  )

table(domain[["off_both"]])
#0   1 
#157  43 

#rows where both codings have presenca_agencia filled in and the link is not marked off
online_checked <- filter(domain, checked.x == 1, checked.y == 1, off_both == 0)

#cross-tabulate the two presenca_agencia codings
table(online_checked[["presenca_agencia.x"]], online_checked[["presenca_agencia.y"]])

#Furthermore, only 5 out of a sample of 2,516 posts containing hyperlinks to domains listed in the domain approach were 
#fact-checked by the social media  companies (based on a visual tag by the social media company)
#sheet of posts containing links, with the link_off and tag codings
links <- "Data/links in posts.xlsx" %>%
  read_excel()

#distribution of the link_off coding
table(links[["link_off"]])

#rows remaining after subtracting 569
#NOTE(review): 569 is hard-coded; presumably it is a count taken from the link_off table above -- confirm
nrow(links) - 569

#distribution of the tag coding
table(links[["tag"]])


#We examine this possibility empirically by reviewing 300 randomly selected posts with a probability of at least 0.9 containing a 
#false story and a cosine similarity between 0 and 0.4, from which 3 coders identified no (0) false stories. 

#the two check-sample sheets (RA1 and RA2)
ra1 <- read_excel("Data/RA-false-negs-below0.4/Comparing Posts to Rumors -- check sample RA1.xlsx")
ra2 <- read_excel("Data/RA-false-negs-below0.4/Cópia de Comparing Posts to Rumors -- check sample RA2.xlsx")

#Place the two sheets side by side. Rows are paired by position, and bind_cols()
#renames duplicated column names with a positional suffix (hence fake...10 / fake...21).
combined <- bind_cols(ra1, ra2)

#distribution of the `fake` coding in each sheet
table(ra1[["fake"]])
table(ra2[["fake"]])

#cross-tabulation of the two `fake` codings
table(combined[["fake...10"]], combined[["fake...21"]])


#In the subset of posts with cosine similarity greater than 0.4, the rate at which we found false stories was 10.3%.

#sum of fake_post divided by sum of cosine_above
with(post, sum(fake_post) / sum(cosine_above))

#It is also possible that posts with a predicted probability of being false below our .9 threshold might still have a cosine 
#similarity > .4. We also evaluate this empirically by taking a random sample of 306 posts, stratified by year and platform. 
#Just three of 306 posts (about 1%) produced a cosine match greater than .4 but less than .45. Only one of these featured a candidate 
#potentially sharing an ambiguously false claim, which we discuss in depth in the Online Appendix Section A.8.

####see lines 328-494 for this analysis####

#To evaluate to what extent the domain approach detects hyper-partisan content, our research assistants classified the 200 randomly 
#selected news stories that were linked to the posts identified via the domain approach according to their level of 
#“politicization/partisanship,” understood as political content with a clear political/partisan slant. On average our coders found 
#that in 70% of posts, either the contents of the link or the text posted in social media were “politicized/partisan” or 
#“clearly partisan/politicized,” and less than 20% were considered “neutral.” 

#url dataset sample
#each RA's coded sample, keeping only the rows where partidario_politizado was filled in
url_1 <- filter(
  read_csv("Data/ra_sample_url_df/sample-urls-dataset-RA1 - Sheet1.csv"),
  !is.na(partidario_politizado)
)
url_2 <- filter(
  read_csv("Data/ra_sample_url_df/sample-urls-dataset-RA2 - Sheet1.csv"),
  !is.na(partidario_politizado)
)

#RA1: distribution of the coding, then shares typed in from that table
table(url_1[["partidario_politizado"]])
sum(89, 17) / 140 #politicized/partisan or clearly politicized partisan
25 / 140 #neutral

#RA2: distribution of the coding, then shares typed in from that table
table(url_2[["partidario_politizado"]])
sum(60, 34) / 147 #politicized/partisan or clearly politicized partisan
31 / 147 #neutral

#average of the two RAs' rounded percentages
mean(c(76, 64)) #average for partisan
mean(c(18, 21)) #average for neutral


#In an alternative analysis, we include only domains that were found at least twice in Facebook URLs dataset (N=59 domains)

#tally rows per root domain, keep the domains that occur more than once, and count them
full_url %>%
  count(base_url, name = "total") %>%
  filter(total > 1) %>%
  summarize(n())

#59

#Finally, using domains independently verified by the Global Disinformation Index (N=17 domains),
base_url_gdi <- read_csv("Data/base-url-gdi.csv")

#number of rows in the GDI domain list
nrow(base_url_gdi)

#17


#===================
#appendix tables: Online Engagement with Posts with and without Misinformation (Table 8-11)
#===================

#Summarise online engagement by a grouping column.
#  data:      data frame with total_likes, total_reactions and total_views columns
#  group_var: unquoted name of the column to group by
#Returns one row per value of group_var with the median and mean of likes, reactions
#and views; missing values are dropped within each statistic (na.rm = TRUE).
summarise_engagement <- function(data, group_var) {
  data %>%
    group_by({{ group_var }}) %>%
    summarise(
      median_like = median(total_likes, na.rm = TRUE),
      avg_like = mean(total_likes, na.rm = TRUE),
      median_reaction = median(total_reactions, na.rm = TRUE),
      avg_reaction = mean(total_reactions, na.rm = TRUE),
      median_views = median(total_views, na.rm = TRUE),
      avg_views = mean(total_views, na.rm = TRUE)
    )
}

#Table 8: Posts with Misinformation (Text) and Online Engagement
xtable(summarise_engagement(post, fake_post))

#Table 9: Posts with Misinformation (Domain) and Online Engagement
xtable(summarise_engagement(post, base_url_fake))

#Table 10: Posts with Misinformation (FB URL) and Online Engagement
xtable(summarise_engagement(post, full_fake_url))

#Table 11: Posts with Misinformation (any measure) and Online Engagement
xtable(summarise_engagement(post, misinfo_class))
